library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read the activity log straight out of the zip archive. Only `date` gets an
# explicit class (Date); colClasses is given as a named character vector, which
# is the documented form, so the other columns fall back to read.csv's
# automatic type conversion.
activity <- read.csv(
  unz("activity.zip", "activity.csv"),
  colClasses = c(date = "Date")
)

# Convert the interval column into a factor so that plotting/processing does
# not treat the interval codes as plain integers (i.e. the distance between
# 1155 and 1200 becomes equal to the distance between 1145 and 1150).
# Zero-padding to four digits also makes the labels more human-readable.
activity$interval <- factor(sprintf("%04d", activity$interval))

Missing values / imputation

# Bar chart of the number of NA step readings on each date.
activity %>%
  group_by(date) %>%
  summarize(miss_count = sum(is.na(steps))) %>%
  ggplot(aes(date, miss_count)) +
  geom_bar(stat = "identity")

# Number of NA step readings per date, printed as a plain vector.
activity %>%
  group_by(date) %>%
  summarize(miss_count = sum(is.na(steps))) %>%
  .$miss_count
##  [1] 288   0   0   0   0   0   0 288   0   0   0   0   0   0   0   0   0
## [18]   0   0   0   0   0   0   0   0   0   0   0   0   0   0 288   0   0
## [35] 288   0   0   0   0 288 288   0   0   0 288   0   0   0   0   0   0
## [52]   0   0   0   0   0   0   0   0   0 288
# Number of NA step readings per interval, printed as a plain vector.
activity %>%
  group_by(interval) %>%
  summarize(miss_count = sum(is.na(steps))) %>%
  .$miss_count
##   [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
##  [36] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
##  [71] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [106] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [141] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [176] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [211] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [246] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [281] 8 8 8 8 8 8 8 8

Should near-zero daily totals count as missing too?

# Line chart of total steps per day. sum() is taken without na.rm, so a day
# containing NA readings gets an NA total.
activity %>%
  group_by(date) %>%
  summarize(s = sum(steps)) %>%
  ggplot(aes(date, s)) +
  geom_line(group = 1)
## Warning: Removed 2 rows containing missing values (geom_path).

# Per-day summary used by the plots that follow:
#   S       - total steps for the day, ignoring NA readings
#   missing - TRUE when the day has exactly 288 NA readings
# (TRUE is spelled out: `T` is an ordinary variable that can be reassigned.)
sfrm <- activity %>%
  group_by(date) %>%
  summarize(
    S = sum(steps, na.rm = TRUE),
    missing = sum(is.na(steps)) == 288
  )

# Daily totals as a line, with one point per day coloured by `missing`.
ggplot(sfrm, aes(date, S)) +
  geom_line() +
  geom_point(aes(color = missing), size = 3) +
  scale_color_manual(values = c("black", "orange"))

Are misses/lows timeseries-dependent?

# Scatter of S (x) against lag(S, 1) (y), coloured by `missing`, with a
# linear-model smoother on top.
sfrm %>%
  ggplot(aes(x = S, y = lag(S, 1), color = missing)) +
  geom_point(size = 3) +
  scale_color_manual(values = c("black", "orange")) +
  stat_smooth(method = lm)
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

# Same S vs. lag(S, 1) scatter without any colour mapping, so the linear fit
# is drawn through all days at once. No colour aesthetic is mapped here, so
# the manual colour scale had nothing to act on and is omitted.
ggplot(sfrm, aes(x = S, y = lag(S, 1))) +
  geom_point(size = 3) +
  stat_smooth(method = "lm")
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

What does the time series look like if we impute the average day?

# Daily totals in ascending order, to eyeball where the "low" days end.
sort(sfrm[["S"]])
##  [1]     0     0     0     0     0     0     0     0    41   126  2492
## [12]  3219  4472  5018  5441  6778  7047  7336  8334  8355  8821  8841
## [23]  8918  9819  9900 10056 10119 10139 10183 10304 10395 10439 10571
## [34] 10600 10765 11015 11162 11352 11458 11829 11834 12116 12426 12608
## [45] 12787 12811 12883 13294 13452 13460 13646 14339 14478 15084 15098
## [56] 15110 15414 15420 17382 20427 21194
# Flag every day whose total is at most 126 steps, then replace the flagged
# days with the mean total of the unflagged days.
sfrm[["low.or.0"]] <- sfrm[["S"]] <= 126
M <- mean(sfrm[["S"]][!sfrm[["low.or.0"]]])
sfrm[["imp"]] <- ifelse(sfrm[["low.or.0"]], M, sfrm[["S"]])

# Imputed daily totals as a line, points coloured by `missing`.
sfrm %>%
  ggplot(aes(date, imp)) +
  geom_line() +
  geom_point(aes(color = missing), size = 3) +
  scale_color_manual(values = c("black", "orange"))

Interpolation would be nicer maybe, but oh well…

Are missings and zeros/lows consistent on the interval-series level?

# Index plot of the raw step readings. The column is spelled out in full:
# `$step` only resolved to `steps` through partial matching on a data.frame.
plot(activity$steps)

# Build a timestamp for every reading by pasting the date and the interval
# label together and parsing the result as "%Y-%m-%d %H%M" in UTC.
activity$timestamp <- with(
  activity,
  as.POSIXct(strptime(paste(date, interval), format = "%Y-%m-%d %H%M", tz = "UTC"))
)

# Full interval-level series of steps over time.
activity %>%
  ggplot(aes(timestamp, steps)) +
  geom_line()
## Warning: Removed 576 rows containing missing values (geom_path).

# List the days flagged as low-or-zero, in date order.
sfrm %>%
  filter(low.or.0) %>%
  arrange(date)
## Source: local data frame [10 x 5]
## 
##          date   S missing low.or.0      imp
## 1  2012-10-01   0    TRUE     TRUE 11185.12
## 2  2012-10-02 126   FALSE     TRUE 11185.12
## 3  2012-10-08   0    TRUE     TRUE 11185.12
## 4  2012-11-01   0    TRUE     TRUE 11185.12
## 5  2012-11-04   0    TRUE     TRUE 11185.12
## 6  2012-11-09   0    TRUE     TRUE 11185.12
## 7  2012-11-10   0    TRUE     TRUE 11185.12
## 8  2012-11-14   0    TRUE     TRUE 11185.12
## 9  2012-11-15  41   FALSE     TRUE 11185.12
## 10 2012-11-30   0    TRUE     TRUE 11185.12
# Subset `activity` to the consecutive days from `first_day` to `last_day`
# (both inclusive; anything as.Date() accepts, e.g. "2012-10-01").
days_window <- function(first_day, last_day) {
  days <- seq(as.Date(first_day), as.Date(last_day), by = "day")
  filter(activity, date %in% days)
}

# Line-and-point plot of steps against timestamp for an interval-level frame.
plot_intervals <- function(frame) {
  ggplot(frame, aes(timestamp, steps)) + geom_line() + geom_point()
}

# Interval-level view of a few days around each low/zero day listed above.
frm <- days_window("2012-10-01", "2012-10-03")
plot_intervals(frm)
## Warning: Removed 288 rows containing missing values (geom_path).
## Warning: Removed 288 rows containing missing values (geom_point).

frm <- days_window("2012-10-07", "2012-10-09")
plot_intervals(frm)
## Warning: Removed 288 rows containing missing values (geom_point).

frm <- days_window("2012-10-31", "2012-11-05")
plot_intervals(frm)
## Warning: Removed 576 rows containing missing values (geom_point).

frm <- days_window("2012-11-08", "2012-11-12")
plot_intervals(frm)
## Warning: Removed 576 rows containing missing values (geom_point).

frm <- days_window("2012-11-13", "2012-11-16")
plot_intervals(frm)
## Warning: Removed 288 rows containing missing values (geom_point).

frm <- days_window("2012-11-28", "2012-11-30")
plot_intervals(frm)
## Warning: Removed 288 rows containing missing values (geom_path).
## Warning: Removed 288 rows containing missing values (geom_point).

Removing 8 days (fully missing) vs. 10 days (missing or near-zero)

# Helper dataset for missing-value analysis: one row per date.
#   total_steps    - sum of the non-NA step readings
#   na_count       - number of NA step readings
#   non_zero_count - number of non-NA readings that are not zero
# The sums use na.rm = TRUE so a day with NA readings yields 0 instead of NA.
# This replaces the earlier blanket `df[is.na(df)] <- 0` backfill, which ran
# after na.or.low was computed and could overwrite an NA in that logical mask
# with a numeric 0.
activity.missings <- summarize(group_by(activity, date),
                               total_steps = sum(steps, na.rm = TRUE),
                               na_count = sum(is.na(steps)),
                               non_zero_count = sum(steps != 0, na.rm = TRUE))
# A day is flagged when it has 288 NA readings or at most two non-zero readings.
activity.missings$na.or.low <- (activity.missings$na_count == 288 |
                                  activity.missings$non_zero_count <= 2)
# POSIXlt's wday is numeric (0 = Sunday, 1 = Monday), so this test does not
# depend on the locale's day names the way weekdays() == "Monday" does.
activity.missings$is.monday <- as.POSIXlt(activity.missings$date)$wday == 1L
# Dates to exclude from later averages.
invalid.days <- activity.missings$date[activity.missings$na.or.low]
# Mean steps per interval across all days, ignoring NA readings.
activity %>%
  group_by(interval) %>%
  summarize(avg_steps = mean(steps, na.rm = TRUE))
## Source: local data frame [288 x 2]
## 
##    interval avg_steps
## 1      0000 1.7169811
## 2      0005 0.3396226
## 3      0010 0.1320755
## 4      0015 0.1509434
## 5      0020 0.0754717
## 6      0025 2.0943396
## 7      0030 0.5283019
## 8      0035 0.8679245
## 9      0040 0.0000000
## 10     0045 1.4716981
## ..      ...       ...
# Histogram of the per-interval mean steps (NA readings ignored).
# NOTE: hist() builds its default title and x label from the argument
# expression, so the call is kept as one inline expression.
hist(summarize(group_by(activity, interval), avg_steps = mean(steps, na.rm = TRUE))$avg_steps)

# Mean steps per interval after dropping every date listed in invalid.days.
# mean() is taken without na.rm here.
activity %>%
  filter(!date %in% invalid.days) %>%
  group_by(interval) %>%
  summarize(avg_steps = mean(steps))
## Source: local data frame [288 x 2]
## 
##    interval  avg_steps
## 1      0000 1.78431373
## 2      0005 0.35294118
## 3      0010 0.13725490
## 4      0015 0.15686275
## 5      0020 0.07843137
## 6      0025 2.17647059
## 7      0030 0.54901961
## 8      0035 0.90196078
## 9      0040 0.00000000
## 10     0045 1.52941176
## ..      ...        ...
# Histogram of the per-interval mean steps after dropping the dates in
# invalid.days (mean() without na.rm).
# NOTE: hist() builds its default title and x label from the argument
# expression, so the call is kept as one inline expression.
hist(summarize(group_by(filter(activity, !date %in% invalid.days), interval), avg_steps = mean(steps))$avg_steps)